The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
Changes 07
MANIFEST 01
META.yml 11
README 11
lib/HTML/Defang.pm 7098
t/01_basic.t 211
t/06_unicode.t 055
7 files changed (This is a version diff) 74174
@@ -1,5 +1,12 @@
 Revision history for Perl extension HTML::Defang.
 
+1.04  Mon Jan 03 12:00:00 2011
+	- Change defang_and_add_to_output to delay the defanging until after the current tag is actually emmitted to the output
+	- If url callback has marked the attribute as to be defanged, skip result of attribute callback
+	- Track <a> tags as part of mismatched tag tracking
+	- Replace magic constants of 0, 1 and 2 in HTML::Defang with exported constants DEFANG_NONE, DEFANG_ALWAYS and DEFANG_DEFAULT
+	- allow attribute values upto 16384 chars long
+
 1.03  Mon Jun 14 16:22:35 2010
 	- fix incorrect tag closing when "/" appears as attribute key
 	- handle deep span nests with the same attrs, not just no attrs
@@ -8,4 +8,5 @@ t/02_xss.t
 t/03_styles.t
 t/04_imports.t
 t/05_callbacks.t
+t/06_unicode.t
 META.yml                                 Module meta-data (added by MakeMaker)
@@ -1,6 +1,6 @@
 --- #YAML:1.0
 name:                HTML-Defang
-version:             1.03
+version:             1.04
 abstract:            Cleans HTML as well as CSS of scripting and other executable contents, and neutralises XSS attacks.
 license:             ~
 author:              
@@ -1,4 +1,4 @@
-HTML-Defang version 1.03
+HTML-Defang version 1.04
 ========================
 
 This module accepts an input HTML and/or CSS string and removes any
@@ -26,16 +26,26 @@ HTML::Defang - Cleans HTML as well as CSS of scripting and other executable cont
   # Callback for custom handling specific HTML tags  
   sub DefangTagsCallback {
     my ($Self, $Defang, $OpenAngle, $lcTag, $IsEndTag, $AttributeHash, $CloseAngle, $HtmlR, $OutR) = @_;
-    return 1 if $lcTag eq 'br';    # Explicitly defang this tag, eventhough safe
-    return 0 if $lcTag eq 'embed'; # Explicitly whitelist this tag, eventhough unsafe
-    return 2 if $lcTag eq 'img';   # I am not sure what to do with this tag, so process as HTML::Defang normally would
+
+    # Explicitly defang this tag, eventhough safe
+    return DEFANG_ALWAYS if $lcTag eq 'br';
+
+    # Explicitly whitelist this tag, eventhough unsafe
+    return DEFANG_NONE if $lcTag eq 'embed';
+
+    # I am not sure what to do with this tag, so process as HTML::Defang normally would
+    return DEFANG_DEFAULT if $lcTag eq 'img';
   }
 
   # Callback for custom handling URLs in HTML attributes as well as style tag/attribute declarations
   sub DefangUrlCallback {
     my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $AttributeHash, $HtmlR) = @_;
-    return 0 if $$AttrValR =~ /safesite.com/i; # Explicitly allow this URL in tag attributes or stylesheets
-    return 1 if $$AttrValR =~ /evilsite.com/i; # Explicitly defang this URL in tag attributes or stylesheets
+
+    # Explicitly allow this URL in tag attributes or stylesheets
+    return DEFANG_NONE if $$AttrValR =~ /safesite.com/i;
+
+    # Explicitly defang this URL in tag attributes or stylesheets
+    return DEFANG_ALWAYS if $$AttrValR =~ /evilsite.com/i;
   }
 
   # Callback for custom handling style tags/attributes
@@ -47,8 +57,12 @@ HTML::Defang - Cleans HTML as well as CSS of scripting and other executable cont
       foreach my $KeyValueRules (@$SelectorRule) {
         foreach my $KeyValueRule (@$KeyValueRules) {
           my ($Key, $Value) = @$KeyValueRule;
-          $$KeyValueRule[2] = 1 if $Value =~ '!important';                  # Comment out any '!important' directive
-          $$KeyValueRule[2] = 1 if $Key =~ 'position' && $Value =~ 'fixed'; # Comment out any 'position=fixed;' declaration
+
+          # Comment out any '!important' directive
+          $$KeyValueRule[2] = DEFANG_ALWAYS if $Value =~ '!important';
+
+          # Comment out any 'position=fixed;' declaration
+          $$KeyValueRule[2] = DEFANG_ALWAYS if $Key =~ 'position' && $Value =~ 'fixed';
         }
       }
       $i++;
@@ -58,9 +72,14 @@ HTML::Defang - Cleans HTML as well as CSS of scripting and other executable cont
   # Callback for custom handling HTML tag attributes
   sub DefangAttribsCallback {
     my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $HtmlR) = @_;
-    $$AttrValR = '0' if $lcAttrKey eq 'border';  # Change all 'border' attribute values to zero.
-    return 1 if $lcAttrKey eq 'src';             # Defang all 'src' attributes
-    return 0;
+
+    # Change all 'border' attribute values to zero.
+    $$AttrValR = '0' if $lcAttrKey eq 'border';
+
+    # Defang all 'src' attributes
+    return DEFANG_ALWAYS if $lcAttrKey eq 'src';
+
+    return DEFANG_NONE;
   }
 
 =head1 DESCRIPTION
@@ -99,13 +118,17 @@ HTML::Defang can defang whole tags, any attribute in a tag, any URL that appear
 
 use Exporter;
 our @ISA = ('Exporter');
-%EXPORT_TAGS = (all => [qw(@FormTags)]);
+%EXPORT_TAGS = (all => [qw(@FormTags DEFANG_NONE DEFANG_ALWAYS DEFANG_DEFAULT)]);
 Exporter::export_ok_tags('all');
 
 use strict;
 use warnings;
 
-our $VERSION=1.03;
+our $VERSION=1.04;
+
+use constant DEFANG_NONE => 0;
+use constant DEFANG_ALWAYS => 1;
+use constant DEFANG_DEFAULT => 2;
 
 use Encode;
 
@@ -117,7 +140,7 @@ our @FormTags = qw(form input textarea select option button fieldset label legen
 # Some regexps for matching HTML tags + key=value attributes
 my $AttrKeyStartLineRE = qr/(?:[^=<>\s\/\\]{1,}|[\/]\s*(?==))/;
 my $AttrKeyRE = qr/(?<=[\s'"\/])$AttrKeyStartLineRE/;
-my $AttrValRE = qr/[^>\s'"`][^>\s]*|'[^']{0,2000}?'|"[^"]{0,2000}?"|`[^`]{0,2000}?`/;
+my $AttrValRE = qr/[^>\s'"`][^>\s]*|'[^']{0,16384}?'|"[^"]{0,16384}?"|`[^`]{0,16384}?`/;
 my $AttributesRE = qr/(?:(?:$AttrKeyRE\s*)?(?:=\s*$AttrValRE\s*)?)*/;
 my $TagNameRE = qr/[A-Za-z][A-Za-z0-9\#\&\;\:\!_-]*/;
 
@@ -155,7 +178,7 @@ my %Rules =
   "form-method"  => qr/^(get|post)$/i,
   "frame"        => qr/^(void|above|below|hsides|vsides|lhs|rhs|box|border)$/i,
   # href: Not javascript, vbs or vbscript
-  "href"         => qr/^([A-Za-z]*script|.*\&{|mocha|hcp|opera|about|smb|\/dev\/)/i,
+  "href"         => [ qr/(?i:^([a-z]*script\s*:|.*\&{|mocha|hcp|opera\s*:|about\s*:|smb|\/dev\/|<))|[^\x00-\x7f]/ ],
   "usemap-href"  => qr/^#[A-Za-z0-9_.-]+$/,  # this is not really a href at all!
   "input-size"   => qr/^(\d{1,4})$/, # some browsers freak out with very large widgets
   "input-type"   => qr/^(button|checkbox|file|hidden|image|password|radio|readonly|reset|submit|text)$/i,
@@ -181,14 +204,14 @@ my %Rules =
 #  "style"        => qr/expression|eval|script:|mocha:|\&{|\@import|(?<!background-)position:|background-image/i, # XXX there are probably a million more ways to cause trouble with css!
   "style"        => qr/^.*$/s,
 #kc In addition to this, we could strip all 'javascript:|expression|' etc. from all attributes(in attribute_cleanup())
-  "stylesheet"   => qr/expression|eval|script:|mocha:|\&{|\@import/i, # stylesheets are forbidden if Embedded => 1.  css positioning can be allowed in an iframe.
+  "stylesheet"   => [ qr/expression|eval|script:|mocha:|\&{|\@import/i ], # stylesheets are forbidden if Embedded => 1.  css positioning can be allowed in an iframe.
   # NB see also `process_stylesheet' below
-  "style-type"   => qr/script|mocha/i,
+  "style-type"   => [ qr/script|mocha/i ],
   "size"         => qr/^[\d.]+(px|%)?$/i,
   "target"       => qr/^[A-Za-z0-9_][A-Za-z0-9_.-]*$/,
   "base-href"    => qr/^https?:\/\/[\w.\/]+$/,
   "anything"     => qr/^.*$/, #[ 0, 0 ],
-  "meta:content" => [ 0, 0 ],
+  "meta:content" => [ qr// ],
 );
 
 my %CommonAttributes =
@@ -614,7 +637,7 @@ my %CharToEntity = reverse %EntityToChar;
 my %QuoteRe = ('"' => qr/(["&<>])/, "'" => qr/(['&<>])/, "" => qr/(["&<>])/);
 
 # Default list of mismatched tags to track
-my %MismatchedTags = map { $_ => 1 } qw(table tbody thead tr td th font div span pre center blockquote dl ul ol h1 h2 h3 h4 h5 h6 fieldset tt p noscript);
+my %MismatchedTags = map { $_ => 1 } qw(table tbody thead tr td th font div span pre center blockquote dl ul ol h1 h2 h3 h4 h5 h6 fieldset tt p noscript a);
 
 # When fixing mismatched tags, sometimes a close tag
 #  shouldn't close all the way out
@@ -856,15 +879,15 @@ If $Defang->{tags_callback} exists, and HTML::Defang has parsed a tag preset in
 
 =over 4
 
-=item 0
+=item DEFANG_NONE
 
 The current tag will not be defanged.
 
-=item 1
+=item DEFANG_ALWAYS
 
 The current tag will be defanged.
 
-=item 2
+=item DEFANG_DEFAULT
 
 The current tag will be processed normally by HTML:Defang as if there was no callback method specified.
 
@@ -898,15 +921,15 @@ See $AttributeHash for details of decoding.
 
 =over 4
 
-=item 0
+=item DEFANG_NONE
 
 The current attribute will not be defanged.
 
-=item 1
+=item DEFANG_ALWAYS
 
 The current attribute will be defanged.
 
-=item 2
+=item DEFANG_DEFAULT
 
 The current attribute will be processed normally by HTML:Defang as if there was no callback method specified.
 
@@ -943,15 +966,15 @@ rather than just a scalar value. You can add attributes (remember to make it a s
 
 =over 4
 
-=item 0
+=item DEFANG_NONE
 
 The current URL will not be defanged.
 
-=item 1
+=item DEFANG_ALWAYS
 
 The current URL will be defanged.
 
-=item 2
+=item DEFANG_DEFAULT
 
 The current URL will be processed normally by HTML:Defang as if there was no callback method specified.
 
@@ -984,12 +1007,12 @@ The declaration blocks will get parsed into the following data structure:
 
   [
     [
-      [ "b", "c", 2],
-      [ "d", "e", 2]
+      [ "b", "c", DEFANG_DEFAULT ],
+      [ "d", "e", DEFANG_DEFAULT ]
     ],
     [
-      [ "k", "l", 2],
-      [ "m", "n", 2]
+      [ "k", "l", DEFANG_DEFAULT ],
+      [ "m", "n", DEFANG_DEFAULT ]
     ]
   ]
 
@@ -997,13 +1020,13 @@ So, generally each property:value pair in a declaration is parsed into an array
 
   ["property", "value", X]
 
-where X can be 0, 1 or 2, and 2 the default value. A client can manipulate this value to instruct HTML::Defang to defang this property:value pair.
+where X can be DEFANG_NONE, DEFANG_ALWAYS or DEFANG_DEFAULT, and DEFANG_DEFAULT the default value. A client can manipulate this value to instruct HTML::Defang to defang this property:value pair.
 
-0 - Do not defang
+DEFANG_NONE - Do not defang
 
-1 - Defang the style:property value
+DEFANG_ALWAYS - Defang the style:property value
 
-2 - Process this as if there is no callback specified
+DEFANG_DEFAULT - Process this as if there is no callback specified
 
 =item I<$IsAttr>
 
@@ -1153,7 +1176,7 @@ sub defang {
         }
 
         NoParseAttributes:
-        my $Defang = 1;
+        my $Defang = DEFANG_ALWAYS;
 
         my $TagOps = $Tags{lc $Tag};
 
@@ -1178,10 +1201,10 @@ sub defang {
         my $TagContent = $TagTrail . join("", grep { defined } map { @$_ } @Attributes);
 
         $Defang ||= $Self->track_tags(\$O, \$I, $TagOps, \$OpenAngle, $IsEndTag, $Tag, \$TagContent)
-          if $Self->{fix_mismatched_tags} && ($Defang == 2 || $Defang == 0);
+          if $Self->{fix_mismatched_tags} && ($Defang != DEFANG_ALWAYS);
 
         # defang unknown tags
-        if ($Defang) {
+        if ($Defang != DEFANG_NONE) {
           warn "defang Defanging $Tag" if $Debug;
           $Tag = $Self->{defang_string} . $Tag
             if $Self->{allow_double_defang}
@@ -1289,6 +1312,9 @@ sub defang {
       if (exists $Self->{AppendOutput}) {
         $O .= delete $Self->{AppendOutput};
       }
+      if (exists $Self->{DelayedAppendOutput}) {
+        $O .= $Self->defang(delete $Self->{DelayedAppendOutput});
+      }
       next;
     }
   
@@ -1344,7 +1370,8 @@ sub add_to_output {
 
 sub defang_and_add_to_output {
   my $Self = shift;
-  $Self->add_to_output($Self->defang(shift));
+  $Self->{DelayedAppendOutput} = '' if !defined $Self->{DelayedAppendOutput};
+  $Self->{DelayedAppendOutput} .= shift;
 }
 
 =item B<INTERNAL METHODS>
@@ -1429,8 +1456,7 @@ sub defang_script {
   }
 
   # Also defang tag
-  return 1;
-
+  return DEFANG_ALWAYS;
 }
 
 =item I<defang_style($OutR, $HtmlR, $TagOps, $OpenAngle, $IsEndTag, $Tag, $TagTrail, $Attributes, $CloseAngle, $IsAttr)>
@@ -1465,7 +1491,7 @@ sub defang_style {
   warn "defang_style Tag=$Tag IsEndTag=$IsEndTag IsAttr=$IsAttr" if $Self->{Debug};
 
   # Nothing to do if end tag
-  return 0 if !$IsAttr && $IsEndTag;
+  return DEFANG_NONE if !$IsAttr && $IsEndTag;
 
   # Do all style work in byte mode
   use bytes;
@@ -1561,8 +1587,7 @@ sub defang_style {
   }
    
   # We don't want <style> tags to be defanged
-  return 0;
-
+  return DEFANG_NONE;
 }
 
 =item I<cleanup_style($StyleString)>
@@ -1686,7 +1711,7 @@ sub defang_stylerule {
       warn "defang_stylerule Key=$Key Value=$Value Separator=$Separator ValueEnd=$ValueEnd" if $Self->{Debug};
       # Store everything except style property and value in a hash
       $StyleKeyExtraData{lc $Key} = [$KeyPilot, $Separator, $QuoteStart, $QuoteEnd, $ValueEnd, $ValueTrail];
-      my $DefangStyleRule = 2;
+      my $DefangStyleRule = DEFANG_DEFAULT;
 
       # If the style value has a URL in it and URL callback has been supplied, make a url_callback
       if ($Self->{url_callback} && $Value =~ m/\s*url\(\s*((?:['"])?)(.*?)\1\s*\)/i) {
@@ -1768,10 +1793,10 @@ sub defang_stylerule {
         ($Separator, $ValueEnd, $ValueTrail) = (":", ";", " ") unless $v;
         
         # Flag to defang if a url, expression or unallowed character found
-        if ($Defang == 2) {
-          $Defang = $Value =~ m{^\s*[a-z0-9%!"'`:()#\s.,\/+-]+\s*;?\s*$}i ? 0 : 1;
-          $Defang = $Value =~ m{^\s*url\s*\(}i ? 1 : $Defang;
-          $Defang = $Value =~ m{^\s*expression\s*\(}i ? 1 : $Defang;
+        if ($Defang == DEFANG_DEFAULT) {
+          $Defang = $Value =~ m{^\s*[a-z0-9%!"'`:()#\s.,\/+-]+\s*;?\s*$}i ? DEFANG_NONE : DEFANG_ALWAYS;
+          $Defang = $Value =~ m{^\s*url\s*\(}i ? DEFANG_ALWAYS : $Defang;
+          $Defang = $Value =~ m{^\s*expression\s*\(}i ? DEFANG_ALWAYS : $Defang;
         }
 
         ($KeyPilot, $Key, $Separator, $QuoteStart, $Value, $QuoteEnd, $ValueEnd, $ValueTrail) =
@@ -1779,8 +1804,8 @@ sub defang_stylerule {
             ($KeyPilot, $Key, $Separator, $QuoteStart, $Value, $QuoteEnd, $ValueEnd, $ValueTrail);
         
         # Comment out the style property-value pair if $Defang
-        $Key = $Defang ? "/*" . $Key : $Key;
-        $ValueEnd = $Defang ? $ValueEnd . "*/" : $ValueEnd;
+        $Key = $Defang != DEFANG_NONE ? "/*" . $Key : $Key;
+        $ValueEnd = $Defang != DEFANG_NONE ? $ValueEnd . "*/" : $ValueEnd;
 
         # Put the rule together back
         if (defined($Key)) {
@@ -1841,7 +1866,7 @@ sub defang_attributes {
       $AttribRule = $Tags{$lcTag}{$lcAttrKey};
     }
 
-    my $DefangAttrib = 2;
+    my $DefangAttrib = DEFANG_DEFAULT;
 
     $AttribRule = $CommonAttributes{$lcAttrKey} unless $AttribRule;
     warn "defang_attributes AttribRule=$AttribRule" if $Debug;
@@ -1862,33 +1887,37 @@ sub defang_attributes {
     # If a attribute callback is supplied and its interested in this attribute, we make a attribs_callback
     if ($Self->{attribs_callback} && exists($Self->{attribs_to_callback}->{$lcAttrKey})) {
       warn "defang_attributes Making attribute callback for Tag=$Tag AttrKey=$AttrKey" if $Debug;
-      $DefangAttrib = $Self->{attribs_callback}->($Self->{context}, $Self, $lcTag, $lcAttrKey, $AttrValR, $HtmlR, $OutR);
+      my $DefangResult = $Self->{attribs_callback}->($Self->{context}, $Self, $lcTag, $lcAttrKey, $AttrValR, $HtmlR, $OutR);
+      # Only use new result if not already DEFANG_ALWAYS from url_callback
+      $DefangAttrib = $DefangResult if $DefangAttrib != DEFANG_ALWAYS;
     }
 
-    if (($DefangAttrib == 2) && $AttribRule) {
+    if (($DefangAttrib == DEFANG_DEFAULT) && $AttribRule) {
       my $Rule = $Rules{$AttribRule};
       warn "defang_attributes AttribRule=$AttribRule Rule=$Rule" if $Debug;
 
       # We whitelist the attribute if the value matches the rule
-      if (ref($Rule) eq "Regexp" && $AttrValStripped =~ $Rule) {
-        $DefangAttrib = 0;
+      if (ref($Rule) eq "Regexp") {
+        $DefangAttrib = ($AttrValStripped =~ $Rule) ? DEFANG_NONE : DEFANG_ALWAYS;
+      }
+
+      # Hack. Ref to array is a blacklist regexp
+      if (ref($Rule) eq "ARRAY") {
+        $DefangAttrib = ($AttrValStripped =~ $Rule->[0]) ? DEFANG_ALWAYS : DEFANG_NONE;
       }
-      
-      # Defang all scripts in attributes
-      $DefangAttrib = $AttrValStripped =~ /^(javascript:|livescript:|mocha:|vbscript:)/i ? 1 : $DefangAttrib;
       
     } elsif (!$AttribRule)  {
-      $DefangAttrib = 1;
+      $DefangAttrib = DEFANG_ALWAYS;
     }
 
     warn "defang_attributes DefangAttrib=$DefangAttrib" if $Debug;
 
     # Store the attribute defang flag
-    push @$Attr, $DefangAttrib if $DefangAttrib;
+    push @$Attr, $DefangAttrib if $DefangAttrib != DEFANG_NONE;
 
   }
 
-  my $DefangTag = 2;
+  my $DefangTag = DEFANG_DEFAULT;
 
   # Callback if the tag is in @$tags_to_callback
   if (exists($Self->{tags_to_callback}->{$lcTag})) {
@@ -1912,7 +1941,7 @@ sub defang_attributes {
     # (attribute could be undef for buggy html, eg <ahref=blah>)
     $Attr->[0] = $Self->{defang_string}
       . ( $Attr->[0] || '' )
-      if $Attr->[7]
+      if defined($Attr->[7]) && $Attr->[7] != DEFANG_NONE
         && (
           $Self->{allow_double_defang}
           || (
@@ -1920,7 +1949,7 @@ sub defang_attributes {
               0, length( $Self->{defang_string} ) ) ne $Self->{defang_string}
           )
         );
-    # Set this to undef, or this value will appear in the output
+    # Set defang value to undef, or this value will appear in the output
     $Attr->[7] = undef;
 
     # Requote specials in attribute value
@@ -1951,8 +1980,8 @@ sub defang_attributes {
   @$Attributes = @OutputAttributes;
 
   # If its a known tag, we whitelist it
-  if ($DefangTag == 2 && (my $TagOps = $Tags{$lcTag})) {
-    $DefangTag = 0;
+  if ($DefangTag == DEFANG_DEFAULT && (my $TagOps = $Tags{$lcTag})) {
+    $DefangTag = DEFANG_NONE;
   }
   
   return $DefangTag;
@@ -2007,7 +2036,7 @@ sub track_tags {
     my ($Found, $ClosingTags) = (0, '');
 
     # Tag not even open, just defang it
-    return 1 if !$OpenedTagsCount->{$lcTag};
+    return DEFANG_ALWAYS if !$OpenedTagsCount->{$lcTag};
 
     # Check tag stack up to find mismatches
     while (@$OpenedTags) {
@@ -2035,7 +2064,7 @@ sub track_tags {
 
     # Otherwise hit tag that stops breaking out, defang it
     } else {
-      return 1;
+      return DEFANG_ALWAYS;
     }
 
   }
@@ -2066,7 +2095,7 @@ sub track_tags {
     }
   }
 
-  return 0;
+  return DEFANG_NONE;
 }
 
 sub track_tag {
@@ -2201,7 +2230,6 @@ sub get_applicable_charset {
 
   # Return fallback charset if no header or meta charset found
   return $Charset ? $Charset : shift;
-
 }
 
 =head1 SEE ALSO
@@ -312,7 +312,7 @@ like($Res, qr{^<table>
 $H = <<EOF;
 <table>
 <tr>
-<td><a>
+<td><i>
 <pre>
 </tr>
 EOF
@@ -321,7 +321,7 @@ $Res =~ s/<!--.*?-->//g;
 
 like($Res, qr{^<table>
 <tr>
-<td><a>
+<td><i>
 <pre>
 </pre></td></tr>
 </table>$}, "Add multiple missing closing tags when one closing tag and one non-callback tag is present");
@@ -413,6 +413,15 @@ $Res =~ s/<!--.*?-->//g;
 
 like($Res, qr{^<div>abc<span>def</span><p><span>abc</span></p><span>def</span>abc</div>$}, "Check close/open inline within block tags");
 
+#	$H = <<EOF;
+#	<div><i><b><p><span>abc</span></p></i></b></div>
+#	EOF
+#	$Res = $Defang->defang($H);
+#	$Res =~ s/<!--.*?-->//g;
+#	
+#	# Note: This result isn't actually quite right (should be reopened with <p><i><b><span>), but it'll do
+#	like($Res, qr{^<div><i><b></b></i><p><span><i><b>abc</b></i></span></p></div>$}, "Check close/open multiple inline within block tags");
+
 
 $Defang = HTML::Defang->new(
     fix_mismatched_tags => 1,
@@ -0,0 +1,55 @@
+#!/usr/bin/perl -w
+
+BEGIN { do '/home/mod_perl/hm/ME/FindLibs.pm'; }
+
+use utf8;
+use Test::More tests => 19;
+use HTML::Defang;
+use Encode;
+use Devel::Peek;
+use strict;
+
+my ($Res, $H);
+my ($DefangString, $CommentStartText, $CommentEndText) = ('defang_', '', '');
+
+#################################
+#  Check unicodeness is preserved despite internal non-unicode magic
+#################################
+
+my $Defang = HTML::Defang->new(
+  tags_to_callback => [ qw(a p) ],
+  tags_callback => sub {
+    my ($Context, $Defang, $Angle, $Tag, $IsEndTag, $AttributeHash, $AttributesEnd, $HtmlR, $OutR) = @_;
+    if ($Tag eq 'a' && !$IsEndTag) {
+      ok(Encode::is_utf8(${$AttributeHash->{href}}), "attr is unicode");
+      is(${$AttributeHash->{href}}, 'http://blah.com/ø', "attr unicode is correct");
+      ${$AttributeHash->{href}} = 'http://blah.com/ø';
+      ok(Encode::is_utf8(${$AttributeHash->{href}}), "attr is unicode2");
+    } elsif ($Tag eq 'p' && !$IsEndTag) {
+      ok(Encode::is_utf8($$HtmlR), "html ref is unicode");
+      ok($$HtmlR =~ /\G(?=岡)/gc, "html ref unicode is correct");
+    }
+    return 1;
+  }
+);
+$H = <<EOF;
+<p>岡</p>
+<a href="http://blah.com/ø" class="û">non-english href</a>
+EOF
+ok(Encode::is_utf8($H), "input is unicode");
+$Res = $Defang->defang($H);
+ok(Encode::is_utf8($Res), "output is unicode");
+like($Res, qr{^<!--defang_p-->岡<!--/defang_p-->}, "defang preserves unicode");
+like($Res, qr{^<!--defang_a defang_href="http://blah\.com/ø" defang_class="û"-->non-english href<!--/defang_a-->}m, "defang preserves unicode2");
+$H = <<EOF;
+<p>岡</p>
+<a href="http://blah.com/ø" class="&#251;">non-english href</a>
+<style>a { color:red&#251;; }</style>
+EOF
+ok(Encode::is_utf8($H), "input2 is unicode");
+$Res = $Defang->defang($H);
+ok(Encode::is_utf8($Res), "output2 is unicode");
+like($Res, qr{^<!--defang_p-->岡<!--/defang_p-->}, "defang2 preserves unicode");
+like($Res, qr{^<!--defang_a defang_href="http://blah\.com/ø" defang_class="û"-->non-english href<!--/defang_a-->}m, "defang2 preserves unicode2");
+like($Res, qr(^<style><!--a { /\*color:redû;\*/ }--></style>)m, "style unicode correct");
+